In [1]:
%matplotlib inline

Lightweight Corpus Analysis


In [3]:
import csv 

import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 

sns.set_context('talk')
sns.set_style('whitegrid')

In [16]:
def get_data(path='../fixtures/sizes.csv'):
    with open(path, 'r') as f:
        reader = csv.reader(f)
        return pd.DataFrame([
                {
                    'category': row[0].split('/')[0],
                    'size': int(row[1]),
                }
                for row in reader if len(row) > 1
            ])

data = get_data()
data['size'].mean()


Out[16]:
228281.10466225282

In [12]:
sns.distplot(data['size'], rug=False)


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x207997ed0>

In [13]:
data.boxplot()


/usr/local/lib/python2.7/site-packages/pandas/tools/plotting.py:2633: FutureWarning: 
The default value for 'return_type' will change to 'axes' in a future release.
 To use the future behavior now, set return_type='axes'.
 To keep the previous behavior and silence this warning, set return_type='dict'.
  warnings.warn(msg, FutureWarning)
Out[13]:
{'boxes': [<matplotlib.lines.Line2D at 0x2085557d0>],
 'caps': [<matplotlib.lines.Line2D at 0x2085646d0>,
  <matplotlib.lines.Line2D at 0x208564d10>],
 'fliers': [<matplotlib.lines.Line2D at 0x20856f9d0>],
 'means': [],
 'medians': [<matplotlib.lines.Line2D at 0x20856f390>],
 'whiskers': [<matplotlib.lines.Line2D at 0x208555990>,
  <matplotlib.lines.Line2D at 0x208564090>]}